library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
vaccine<-read.csv("COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv")
case_death<-read.csv("United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv")
USstate=c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS",
          "KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY",
          "NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV",
          "WI","WY")
case_death<-case_death %>% 
  transmute(date=mdy(submission_date),
            state=state,
            tot_case=tot_cases,
            tot_death=tot_death,
            new_case=new_case,
            new_death=new_death
            ) %>%
  filter(state %in% USstate) %>%
  filter(date >= as.Date("2021-01-01") & date<=as.Date("2021-11-30"))
vaccine<-vaccine %>% 
  dplyr::select(c(1,3,13,14,15,16,17,18,19,26,28,30,32,34,36,38,40,42,43,44,46,47,48,50,51,52,54,55,56,58,60,62,64,66,67,68)) %>%
  rename(state=Location) %>% 
  mutate(date=mdy(Date)) %>%
  dplyr::select(-Date)

Combine two tables into #comb# #new_comb_ma# contains only data in MA

comb<-right_join(vaccine,case_death,by=c("date","state"))
#write.csv(comb,"bigtable.csv",row.names = FALSE)
new_comb_ma<-comb%>%
  filter(state=="MA")%>%
  arrange(date)
  #mutate(new_case=tot_case-lag(tot_case)) %>%
  #mutate(new_death=tot_death-lag(tot_death))%>%
  #mutate(new_adm=Administered-lag(Administered)) %>%
  #mutate(new_jj=Administered_Janssen-lag(Administered_Janssen)) %>%
  #mutate(new_moderna=Administered_Moderna-lag(Administered_Moderna))%>%
  #mutate(new_pfizer=Administered_Pfizer-lag(Administered_Pfizer))

The number of new cases against date in MA

new_comb_ma%>%ggplot(aes(x=date))+
  geom_point(aes(y=new_case))+
  geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
  geom_text(aes(x=as.Date("2021-04-30"),y=6700,label="Delta variant became dominant"))

The number of new death against date in MA

new_comb_ma%>%ggplot(aes(date,new_death))+
  geom_point()+
  geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
  geom_text(aes(x=as.Date("2021-08-30"),y=70,label="Delta variant became dominant"))

The change of new cases across all states

comb%>%
  ggplot(aes(date,new_case,group=state))+
  geom_line(color='darkseagreen3',alpha=0.6)+
  geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
  geom_text(aes(x=as.Date("2021-04-30"),y=40000,label="Delta variant became dominant"))

The change of new deaths across all states

comb%>%
  ggplot(aes(date,new_death,group=state))+
  geom_line(color='darkseagreen3',alpha=0.6)+
  geom_vline(xintercept=as.Date("2021-07-03"),color="red")+
  geom_text(aes(x=as.Date("2021-08-30"),y=700,label="Delta variant became dominant"))

# Distinguish states and represent new case using color
library(RColorBrewer)
comb %>% ggplot(aes(date, state,  fill = new_case)) + 
  geom_tile(color = "grey50") + 
  scale_x_continuous(expand = c(0,0)) + 
  scale_fill_gradientn(colors = brewer.pal(9, "Oranges"), trans = "sqrt") + 
  #geom_text(aes(x=as.Date("2021-08-30"), y=700, label="Delta variant became dominant")) + 
  theme_minimal() +  theme(panel.grid = element_blank()) + 
  xlab("") + ylab("")
## Warning in self$trans$transform(x): 产生了NaNs
## Warning: Transformation introduced infinite values in discrete y-axis

Now, we are going to do some visualizations using maps.

# Pull out us state map data frame
us_states <- map_data("state")

us_states %>% ggplot(aes(x = long, y = lat, fill = region, group = group)) + 
              geom_polygon(color = "white", fill = "grey") + 
              coord_fixed(1.3) +
              guides(fill = FALSE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Before joining the map data with the CDC data, we need to make sure that the states match successfully with each other whenever possible.

x <- case_death$state
case_death <- case_death %>% mutate(region = tolower(state.name[match(x, state.abb)]))
y <- vaccine$state
vaccine <- vaccine %>% mutate(region = tolower(state.name[match(y, state.abb)])) %>%
  mutate()

Then we can combine the two dataset using join functions.

# Choose two dates (one before delta dominant, one after)
dates <- c(ymd("2021-06-01", "2021-09-01"))

# Filter case_death data for the two dates and join with us_states data frame.
comb_cd <- case_death %>% filter(date %in% dates)
comb_cd <- left_join(comb_cd, us_states, by = "region")

# Filter vaccine data for the two dates and join with us_states data frame.
comb_vac <- vaccine %>% filter(date %in% dates)
comb_vac <- left_join(comb_vac, us_states, by = "region")

First we explore the map of new cases and deaths.

# Heatmap of new cases on June 1, 2021 and Sept. 1, 2021.
comb_cd %>% ggplot(aes(x = long, y = lat, group = group, fill = new_case)) + 
  geom_polygon(color = "white") + 
  coord_fixed(1.3) +
  facet_grid(date ~ .) +
  ggtitle("Map of new cases") +
  scale_fill_viridis_c(name = "New cases", option = "inferno", direction = -1, trans = "sqrt")

# Heatmap of new deaths on June 1, 2021 and Sept. 1, 2021.
comb_cd %>% ggplot(aes(x = long, y = lat, group = group, fill = new_death)) + 
  geom_polygon(color = "white") + 
  coord_fixed(1.3) +
  facet_grid(date ~ .) +
  ggtitle("Map of new deaths") +
  scale_fill_viridis_c(name = "New deaths", option = "rocket", direction = -1, trans = "sqrt")
## Warning in self$trans$transform(x): 产生了NaNs
## Warning: Transformation introduced infinite values in discrete y-axis

Then we explore the map of vaccination.

# Filter case_death data on Sept. 1, 2021 and join with us_states data frame.
comb_cd <- case_death %>% filter(date == "2021-09-01")
comb_cd <- left_join(comb_cd, us_states, by = "region")

# Filter vaccine data for the two dates and join with us_states data frame.
comb_vac <- vaccine %>% filter(date == "2021-09-01")
comb_vac <- left_join(comb_vac, us_states, by = "region")
# Heatmap of vaccine (regardless of brands) on June 1, 2021 and Sept. 1, 2021.
comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered)) + 
  geom_polygon(color = "white") + 
  coord_fixed(1.3) +
  ggtitle("Map of total vaccines for each state") +
  scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")

See the difference between vaccine brands.

# Heatmap of vaccine (Moderna) on Sept. 1, 2021.
p1 <- comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered_Moderna)) + 
  geom_polygon(color = "white") + 
  coord_fixed(1.3) +
  ggtitle("Map of Moderna vaccinations") +
  scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")
# Heatmap of vaccine (Pfizer) on Sept. 1, 2021.
p2 <- comb_vac %>% ggplot(aes(x = long, y = lat, group = group, fill = Administered_Pfizer)) + 
  geom_polygon(color = "white") + 
  coord_fixed(1.3) +
  ggtitle("Map of Pfizer vaccinations") +
  scale_fill_viridis_c(name = "Vaccination", option = "mako", direction = -1, trans = "log")
grid.arrange(p1, p2, ncol = 1)
## Warning: Transformation introduced infinite values in discrete y-axis

Classify the number of new cases into three categories (1: new case<=10000 on that day, 2: 10000<new case<=20000 on that day, 3: new case>20000 on that day) Also classify the number of new deaths into three categories (1: new death<=200 on that day, 2: 200<new death<=400 on that day, 3: new death>400 on that day)

clascomb<-comb%>%
  mutate(new_case_clas=ifelse(new_case<=10000,1,ifelse(new_case>20000,3,2)))%>%
  mutate(new_death_clas=ifelse(new_death<=200,1,ifelse(new_death>400,3,2)))

Randomly divide the table into one train set (70% data) and one test set(30% data)

inTrain   <- createDataPartition(y = clascomb$new_case, p = 0.7, times = 1, list = FALSE)
train_set <- slice(clascomb, inTrain)
test_set  <- slice(clascomb, -inTrain)

Classification using decision tree model

rpart_fit <- rpart(new_case_clas ~ Administered, data = train_set)
rpart_preds <- predict(rpart_fit, test_set)
rpart_res<-ifelse(rpart_preds<1.5,1,ifelse(rpart_preds>=2.5,3,2))
#rpart_preds
confusionMatrix(data = as.factor(rpart_res), reference = as.factor(test_set$new_case_clas),positive="1")
## Warning in levels(reference) != levels(data): 长的对象长度不是短的对象长度的整倍
## 数
## Warning in confusionMatrix.default(data = as.factor(rpart_res), reference
## = as.factor(test_set$new_case_clas), : Levels are not in the same order for
## reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3
##          1 4850   51   26
##          2   61   13    7
##          3    0    0    0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.971          
##                  95% CI : (0.966, 0.9755)
##     No Information Rate : 0.9806         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1733         
##                                          
##  Mcnemar's Test P-Value : 2.087e-07      
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            0.9876 0.203125 0.000000
## Specificity            0.2062 0.986246 1.000000
## Pos Pred Value         0.9844 0.160494      NaN
## Neg Pred Value         0.2469 0.989649 0.993411
## Prevalence             0.9806 0.012780 0.006589
## Detection Rate         0.9685 0.002596 0.000000
## Detection Prevalence   0.9838 0.016174 0.000000
## Balanced Accuracy      0.5969 0.594685 0.500000
test_set%>%
  ggplot()+geom_point(aes(date,rpart_preds),color="red")+geom_point(aes(date,new_case_clas))

Classification using random forest model

rf_fit <- randomForest(new_case_clas ~ Administered, data = train_set)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_preds <- predict(rf_fit, test_set)
rf_res<-ifelse(rf_preds<1.5,1,ifelse(rf_preds>=2.5,3,2))
#rpart_preds
confusionMatrix(data = as.factor(rf_res), reference = as.factor(test_set$new_case_clas),positive="1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3
##          1 4818   53   29
##          2   91   11    4
##          3    2    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9643          
##                  95% CI : (0.9587, 0.9692)
##     No Information Rate : 0.9806          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1118          
##                                           
##  Mcnemar's Test P-Value : 3.53e-08        
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2  Class: 3
## Sensitivity            0.9811 0.171875 0.0000000
## Specificity            0.1546 0.980785 0.9995980
## Pos Pred Value         0.9833 0.103774 0.0000000
## Neg Pred Value         0.1389 0.989188 0.9934079
## Prevalence             0.9806 0.012780 0.0065895
## Detection Rate         0.9621 0.002196 0.0000000
## Detection Prevalence   0.9784 0.021166 0.0003994
## Balanced Accuracy      0.5679 0.576330 0.4997990
test_set%>%
  ggplot()+geom_point(aes(date,rf_preds),color="red")+geom_point(aes(date,new_case_clas))

Regression using random forest

newcase_rf<-randomForest(new_case~Administered, data=train_set)
newcase_rf_pred<-predict(newcase_rf,test_set)
test_set%>%
  ggplot()+geom_point(aes(date,newcase_rf_pred),color="red")+geom_point(aes(date,new_case))

Divide the table into trainset and testset by date (cutoff point: 2021-07-31)

trainset<-clascomb%>%
  filter(date <= as.Date("2021-7-31"))
testset<-clascomb%>%
  filter(date >as.Date("2021-07-31"))
#trainset

Machine learning using linear regression

linearfit <- lm(new_case ~ Administered, data = trainset)
linear_pred <- predict(linearfit, testset)
data.frame(
  RMSE = RMSE(linear_pred, testset$new_case),
  R2 = R2(linear_pred, testset$new_case)
)
##       RMSE        R2
## 1 2959.376 0.3180471
#testset%>%
#  ggplot()+geom_line(aes(date,rpart_preds1),color="red")+geom_point(aes(date,new_case))
ggplot(testset, aes(date, new_case) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ x)

Machine learning using random forest

regr_rf<-randomForest(new_case~Administered, data=trainset)
regr_rf_pred<-predict(newcase_rf,testset)
testset%>%
  ggplot()+geom_point(aes(date,regr_rf_pred),color="red")+geom_point(aes(date,new_case))

plot(regr_rf)

sqrt(sum((regr_rf_pred - testset$new_case)^2) / nrow(testset))
## [1] 2083.902